import numpy as np
import pandas as pd
import tensorflow_hub as hub
import tensorflow_text
from tensorflow.keras.models import Sequential, load_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import pairwise_distances, silhouette_score
from sklearn.preprocessing import normalize
from sklearn.cluster import KMeans
from sklearn.utils import shuffle
from skimage.transform import resize
import networkx as nx
from node2vec import Node2Vec
import matplotlib.pyplot as plt
import seaborn as sns
import umap
from termcolor import colored
from tqdm.notebook import tqdm
import re
import nltk
import pickle
import swifter
import random
from os import listdir
from os.path import isfile, join
from collections import Counter
from random import randint
def getRoundedThresholdv1(a, MinClip):
    """Round ``a`` to the nearest multiple of ``MinClip``.

    Uses Python's built-in ``round`` (banker's rounding at exact
    halves), so e.g. ``getRoundedThresholdv1(1723, 5) == 1725``.
    """
    multiples = round(float(a) / MinClip)
    return multiples * MinClip
def load_all_models(verbose=True):
    """Load every persisted artifact of the pipeline from the working directory.

    Returns a 6-tuple:
      (dialog-sequence embedder Keras model,
       phrase-level KMeans model, phrase cluster-name dict,
       trigram-level KMeans model, trigram cluster-name dict,
       the multilingual Universal Sentence Encoder from TF-Hub).

    Side effects: reads local files and downloads/loads the TF-Hub module;
    progress messages are printed when ``verbose`` is true.
    """
    def _unpickle(path):
        # Small helper: load one pickled object from disk.
        with open(path, 'rb') as fh:
            return pickle.load(fh)

    seq_embedder = load_model('cluster_seq_emb.h5')
    if verbose:
        print('dialog embeder is here')
    phrase_kmeans = _unpickle('kmeans_phrases.pkl')
    phrase_names = _unpickle('kmeans_phrases_names.pkl')
    if verbose:
        print('sent cluster model is here')
    seq_kmeans = _unpickle('kmeans_seqs.pkl')
    seq_names = _unpickle('kmeans_seqs_names.pkl')
    if verbose:
        print('dialog cluster model is here')
    use_encoder = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3")
    if verbose:
        print('thats all')
    return seq_embedder, phrase_kmeans, phrase_names, seq_kmeans, seq_names, use_encoder
# --- Notebook-style top-level script: restore the saved models, sanity-check
# --- the USE encoder, then load the drama corpus and plot its year histogram.
cluster_seq_emb, clust_model_kmeans, cluster_names, ngram_kmeans, ngram_clusters_names, embed = load_all_models(verbose=True)
# Duplicate imports left over from a merged notebook cell.
import tensorflow_hub as hub
import numpy as np
import tensorflow_text
# Some texts of different lengths.
english_sentences = ["dog", "Puppies are nice.", "I enjoy taking long walks along the beach with my dog."]
italian_sentences = ["cane", "I cuccioli sono carini.", "Mi piace fare lunghe passeggiate lungo la spiaggia con il mio cane."]
japanese_sentences = ["犬", "子犬はいいです", "私は犬と一緒にビーチを散歩するのが好きです"]
# Re-loads the USE module, replacing the instance returned by load_all_models.
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3")
# Compute embeddings.
en_result = embed(english_sentences)
it_result = embed(italian_sentences)
ja_result = embed(japanese_sentences)
# Compute similarity matrix. Higher score indicates greater similarity.
similarity_matrix_it = np.inner(en_result, it_result)
similarity_matrix_ja = np.inner(en_result, ja_result)
# Every file under drama/ is one pickled play (a list of scenes, each a list of
# (speaker, sentence) pairs — inferred from how dramas_texts is indexed below).
dramas = [f for f in listdir('drama/') if isfile(join('drama/', f))]
dramas_texts = []
for drama in dramas:
    with open(join('drama/', drama), 'rb') as f:
        dramas_texts.append(pickle.load(f))
dramas_texts[0][0][:5]  # notebook-style peek at the first scene's first lines
# Metadata table from DraCor; missing dates get a huge sentinel so min() below
# picks whichever real date exists.
df_with_years = pd.read_csv('dracor-rus-list.csv', sep=',')
df_with_years.fillna(1000000, inplace=True)
# A play's "year" is the earliest of written / premiered / printed.
df_with_years['year'] = df_with_years.apply(lambda x: min(x.written, x.premiered, x.printed), axis=1)
df_with_years['name'] = [link.split('/')[-1] for link in df_with_years['link']]
name_to_year = {name: year for name, year in zip(df_with_years['name'], df_with_years['year'])}
years = [name_to_year[name.split('.')[0]] for name in dramas]
years[:5]
# IPython help magic — only runs inside a notebook, not as plain Python.
?sns.distplot
fig, ax = plt.subplots()
fig.set_size_inches(10, 5)
# Bucket years into 5-year bins for the histogram.
years_rnd = [getRoundedThresholdv1(year, 5) for year in years]
sns.distplot(years_rnd, bins=40, kde=False, color='indigo')
# Flatten the corpus into a single list of sentences, dropping empty ones.
only_sents = []
for drama in tqdm(dramas_texts):
    for part in drama:
        only_sents = only_sents + [sent for _, sent in part]
only_sents = [sent for sent in only_sents if len(nltk.word_tokenize(sent))>0]
only_sents[:5]
def get_sent_vector(sent):
    """Embed one sentence with the (module-level) USE encoder.

    Returns the 512-dim embedding as a plain NumPy array.
    """
    embedded = embed([sent])
    return np.array(embedded[0])
# Smoke-test the embedder on one Russian phrase, then embed the whole corpus.
get_sent_vector('Извольте, сударь')
vectors = [get_sent_vector(sent) for sent in tqdm(only_sents)]
# Cache sentence -> embedding so later passes avoid re-encoding.
sent_to_vector_dict = {sent: vec for sent, vec in tqdm(zip(only_sents, vectors))}
# Cluster-count search is disabled; the pre-fitted 50-cluster model is reused.
# for clust_num in [5, 10, 20, 40, 60, 100, 200]:
for clust_num in [50]:
    # clust_model_kmeans = KMeans(n_clusters=clust_num, random_state=42, precompute_distances=True)
    # clust_model_kmeans.fit(vectors)
    clusters = clust_model_kmeans.predict(vectors)
    # print('{}: {}'.format(clust_num, silhouette_score(X, clusters)))
sents_with_clusters = list(zip(only_sents, clust_model_kmeans.predict(vectors)))
# Interactive labeling: show 20 random sentences from a cluster, then type a
# human-readable name for it (input()). Only cluster 37 here.
# cluster_names = {}
for curr_cluster in [37]:
    print(curr_cluster)
    sents_in_this_cluster = [sent for sent, cluster in sents_with_clusters if cluster==curr_cluster]
    for sent in random.choices(sents_in_this_cluster, k=20):
        print(sent)
    cluster_names[curr_cluster] = input()
    print('=========================\n=========================\n=========================\n\n\n')
cluster_names
def cluster_text(per_text, cluster_model, cluster_names={}, sent_to_vector_dict={}):
    """Assign a phrase-level cluster id to every utterance of one scene.

    Parameters
    ----------
    per_text : iterable of (person, sentence) pairs for one scene.
    cluster_model : fitted clustering model exposing ``predict``.
    cluster_names : unused here; kept for interface compatibility with callers
        (``final_cluster_text`` is the variant that maps ids to names).
    sent_to_vector_dict : cache of precomputed sentence embeddings; sentences
        missing from it are embedded on the fly via ``get_sent_vector``.

    Returns
    -------
    (output, all_clusters)
        ``output`` is a list of (person, {sentence: cluster_id}) pairs;
        ``all_clusters`` is the flat list of cluster ids in utterance order.
    """
    output = []
    all_clusters = []
    for person, sent in per_text:
        if sent in sent_to_vector_dict:
            sent_vec = sent_to_vector_dict[sent]
        else:
            sent_vec = get_sent_vector(sent)
        if sent_vec is not None:
            # BUG FIX: the original ignored the ``cluster_model`` argument and
            # always called the global ``clust_model_kmeans``. Every visible
            # caller passes clust_model_kmeans, so behavior is unchanged for
            # them, but the parameter now actually works.
            cluster = cluster_model.predict([sent_vec])[0]
            output.append((person, {sent: cluster}))
            all_clusters.append(cluster)
    return output, all_clusters
# Spot-check: cluster one scene (play 201, scene 2) and print each utterance
# with its cluster id highlighted in red.
parsed_text, _ = cluster_text(dramas_texts[201][2], clust_model_kmeans, sent_to_vector_dict=sent_to_vector_dict)
for person, rep in parsed_text:
    print(person)
    for phrase in rep:
        print(phrase + ': ' + colored(rep[phrase], 'red'))
    print('\n')
list(sent_to_vector_dict)[:10]  # peek at the cached sentences
# Total utterance count per play (sum of scene lengths).
drama_lens = [sum([len(scene) for scene in drama]) for drama in dramas_texts]
list(zip(range(210), dramas, drama_lens))
# Build a directed cluster-transition graph: an edge A->B counts how often an
# utterance in cluster A is immediately followed by one in cluster B.
# NOTE(review): only the first scene of play 113 is used here ([:1]) — looks
# like a debugging restriction left in; confirm before relying on the graph.
# dramas_names = []
cluster_vecs = []
# for i, drama in tqdm(enumerate([dramas[11]])):
for part_num, part in enumerate(dramas_texts[113][:1]):
    # dramas_names.append(drama.split('.')[0] + '_{}_{}_{}'.format(i, part_num, len(part)))
    _, clusters = cluster_text(part, clust_model_kmeans, sent_to_vector_dict=sent_to_vector_dict)
    cluster_vecs.append(clusters)
clusters_transmission_graph = nx.DiGraph()
for cluster_vec in tqdm(cluster_vecs):
    if len(cluster_vec) >=2:
        for i in range(len(cluster_vec) - 1):
            left_clust = cluster_vec[i]
            right_clust = cluster_vec[i + 1]
            # Increment the edge weight, creating the edge on first sight.
            if clusters_transmission_graph.has_edge(left_clust, right_clust):
                clusters_transmission_graph[left_clust][right_clust]['weight'] += 1
            else:
                clusters_transmission_graph.add_edge(left_clust, right_clust, weight=1)
# Draw the graph with human-readable cluster names as labels.
fig, ax = plt.subplots()
fig.set_size_inches(50, 50)
curr_clust_names = {k:v for k,v in cluster_names.items() if k in list(clusters_transmission_graph.nodes)}
pos = nx.kamada_kawai_layout(clusters_transmission_graph)
nx.draw(clusters_transmission_graph, pos)
nx.draw_networkx_edges(clusters_transmission_graph, pos)
nx.draw_networkx_labels(clusters_transmission_graph, pos, curr_clust_names, font_size=30)
print('done')
from math import exp  # NOTE(review): exp appears unused below — confirm
# Redraw the transition graph with edge width proportional to (scaled) weight.
all_weights = []
for (node1, node2, data) in clusters_transmission_graph.edges(data=True):
    all_weights.append(data['weight']) #we'll use this when determining edge thickness
unique_weights = list(set(all_weights))
fig, ax = plt.subplots()
fig.set_size_inches(50, 50)
curr_clust_names = {k:v for k,v in cluster_names.items() if k in list(clusters_transmission_graph.nodes)}
pos = nx.kamada_kawai_layout(clusters_transmission_graph)
nx.draw_networkx_nodes(clusters_transmission_graph, pos)
nx.draw_networkx_labels(clusters_transmission_graph, pos, curr_clust_names, font_size=40)
for weight in unique_weights:
    # Draw all edges sharing this weight in one call, with a cubic width scale.
    weighted_edges = [(node1,node2) for (node1, node2, edge_attr) in
                      clusters_transmission_graph.edges(data=True) if edge_attr['weight']==weight]
    width = (weight * 50 / sum(all_weights))**3
    nx.draw_networkx_edges(clusters_transmission_graph, pos, edgelist=weighted_edges, width=width)
# Which clusters sit "between" conversation flows the most.
btw_centrality = nx.centrality.betweenness_centrality(clusters_transmission_graph, normalized=True, weight='weight')
{cluster_names[k]: v for k, v in sorted(btw_centrality.items(), key=lambda item: -item[1])}
cluster_names[8]
# Interactive inspection of one phrase cluster (naming via input() disabled).
# cluster_names = {}
# for curr_cluster in set(clust_model_kmeans.predict(X)):
# for curr_cluster in range(50):
for curr_cluster in [39]:
    print(curr_cluster)
    sents_in_this_cluster = [sent for sent, cluster in sents_with_clusters if cluster==curr_cluster]
    for sent in random.choices(sents_in_this_cluster, k=20):
        print(sent)
    # cluster_names[curr_cluster] = input()
    print('=========================\n=========================\n=========================\n\n\n')
# Learn 2-d node embeddings of the transition graph and scatter-plot them.
node2vec = Node2Vec(clusters_transmission_graph, dimensions=2, walk_length=5, num_walks=5, workers=10)
embs = node2vec.fit(window=1, min_count=2)
import seaborn as sns
clust_to_vec_list = [embs[str(clust)] for clust in range(50)]
fig, ax = plt.subplots()
fig.set_size_inches(15, 10)
sns.scatterplot(x=[x for x, y in clust_to_vec_list],
                y=[y for x, y in clust_to_vec_list])
plt.grid()
# IPython help magic — notebook only.
?cluster_text
def list_of_clusters_to_vec(clusters, clust_to_vec):
    """Average the per-cluster embedding vectors of one cluster sequence.

    Returns the element-wise mean of ``clust_to_vec[c]`` over ``clusters``;
    an empty sequence yields an all-zero vector sized like the embedding
    stored under key ``'0'``.
    """
    if not clusters:
        # No utterances: fall back to a zero vector of the embedding dimension.
        return np.array([0] * len(clust_to_vec['0']))
    member_vecs = [clust_to_vec[clust] for clust in clusters]
    return np.mean(member_vecs, axis=0)
# Embed every scene of every play as the mean node2vec vector of its clusters.
drama2vec = []
for i, drama in tqdm(enumerate(dramas)):
    for part_num, part in enumerate(dramas_texts[i]):
        _, clusters = cluster_text(part, clust_model_kmeans, sent_to_vector_dict=sent_to_vector_dict)
        drama2vec.append(list_of_clusters_to_vec([str(clust) for clust in clusters], embs))
# NOTE(review): dramas_names is never defined in this file (its construction
# is commented out above) — this relies on stale notebook state; confirm.
{dramas_name: i for i, dramas_name in enumerate(dramas_names) if 'gorky' in dramas_name}
# 563
# Pairwise Euclidean distances between scene vectors; row 698 is the query.
cos_dis = pairwise_distances(drama2vec, drama2vec, metric='euclidean')
dis_dict = {dramas_names[i]: dis for i, dis in enumerate(cos_dis[698])}
{k: v for k, v in sorted(dis_dict.items(), key=lambda item: item[1])}
# Print two sample scenes with their per-utterance cluster ids.
parsed_text, clusters1 = cluster_text(dramas_texts[37][18], clust_model_kmeans, sent_to_vector_dict=sent_to_vector_dict)
for person, rep in parsed_text:
    print(person)
    for phrase in rep:
        print(phrase + ': ' + colored(rep[phrase], 'red'))
    print('\n')
parsed_text, clusters1 = cluster_text(dramas_texts[10][20], clust_model_kmeans, sent_to_vector_dict=sent_to_vector_dict)
for person, rep in parsed_text:
    print(person)
    for phrase in rep:
        print(phrase + ': ' + colored(rep[phrase], 'red'))
    print('\n')
# Scatter the scene vectors, colored by author, with Gogol emphasized.
import seaborn as sns
fig, ax = plt.subplots()
fig.set_size_inches(30, 20)
# plt.xlim(0.6, 1.2)
# plt.ylim(-1.6, -0.8)
authors = [drama_name.split('-')[0] for drama_name in dramas_names]
authors_sizes = {author: 20 for author in authors}
authors_sizes['gogol'] = 200
sns.scatterplot(x=[x for x, y in drama2vec],
                y=[y for x, y in drama2vec],
                hue=authors,
                size=authors,
                sizes=authors_sizes)
plt.grid()
def ngrams_from_vec(vec, n=5):
    """Return all contiguous n-grams (as slices) of ``vec``.

    A sequence shorter than ``n`` yields the historical sentinel ``[[]]`` so
    downstream code can filter it out (callers test ``ngram != []`` /
    joined-string ``!= ''``).

    BUG FIX: the original used ``range(len(vec) - n)``, silently dropping the
    final n-gram, and treated ``len(vec) == n`` as "too short" — a sequence of
    exactly n items now yields its single n-gram.
    """
    if len(vec) < n:
        # Too short for even one n-gram: keep the sentinel for callers.
        return [[]]
    return [vec[i: i + n] for i in range(len(vec) - n + 1)]
# ngrams_from_vec(cluster_vecs[0], n=5)
103945
# NOTE(review): ngram_counter is defined further down (L258 equivalent) —
# these two cells only work with stale notebook state; confirm cell order.
sum([v for k, v in ngram_counter.items()])
fig, ax = plt.subplots()
fig.set_size_inches(12, 10)
# Histogram of how often each trigram occurs in the corpus (Russian labels:
# x = "number of occurrences of a trigram in the corpus", y = "number of trigrams").
ax = sns.distplot([v for k, v in ngram_counter.items()], bins=136, kde=False)
ax.set_xlabel('Количество вхождений триграмма в корпус', fontsize=28)
ax.set_ylabel('Количество триграммов', fontsize=28)
# Collect every cluster trigram across all scene cluster sequences.
all_ngrams = []
for clusters_vec in tqdm(cluster_vecs):
    all_ngrams.extend(ngrams_from_vec(clusters_vec, n=3))
# ngram_counter = Counter([','.join([str(x) for x in ngram]) for ngram in all_ngrams if 39 in ngram])
# ngram_counter = Counter([', '.join([cluster_names.get(x, str(x)) + ' ' + str(x) for x in ngram]) for ngram in all_ngrams if ngram != []])
ngram_counter = Counter([','.join([cluster_names.get(x, str(x)) for x in ngram]) for ngram in all_ngrams if ngram != []])
# Top-10 trigrams, formatted as LaTeX table rows.
for ng, cnt in list(({k: v for k, v in sorted(ngram_counter.items(), key=lambda item: -item[1])}).items())[:10]:
    print('{}&{}&{}&{}\\\\\\hline'.format(ng.split(',')[0], ng.split(',')[1], ng.split(',')[2], cnt))
# {k: v for k, v in sorted(ngram_counter.items(), key=lambda item: -item[1])}
# import tensorflow as tf
# tf.compat.v1.enable_eager_execution()
# Map each cluster trigram (as "a,b,c" string) to the sentence triples that
# produced it, across the whole corpus.
ngrams_sents = {}
for i, drama in tqdm(enumerate(dramas)):
    for part_num, part in enumerate(dramas_texts[i]):
        # try:
        text_with_clusters, clusters = cluster_text(part, clust_model_kmeans, sent_to_vector_dict=sent_to_vector_dict)
        cluster_ngrams_for_this_text = [','.join([str(x) for x in ngram])
                                        for ngram in ngrams_from_vec(clusters, n=3)]
        # Parallel trigrams over the raw sentences (same windows as above).
        sent_ngrams_for_this_text = ngrams_from_vec([list(sent)[0] for _, sent in text_with_clusters], n=3)
        for ngram, sents_ngram in zip(cluster_ngrams_for_this_text, sent_ngrams_for_this_text):
            if ngrams_sents.get(ngram) is not None:
                ngrams_sents[ngram] = ngrams_sents[ngram] + [sents_ngram]
            else:
                ngrams_sents[ngram] = [sents_ngram]
        # except:
        #     print(part_num)
# Example lookup: all sentence triples behind trigram 14->38->14.
for sents_set in ngrams_sents['14,38,14']:
    print('\n\n'.join(sents_set))
    print('============================\n'*3)
# 2-d UMAP projection of the sentence embeddings.
reducer = umap.UMAP()
embedding = reducer.fit_transform(vectors)
embedding.shape
len(sents_with_clusters)
fig, ax = plt.subplots()
fig.set_size_inches(15, 10)
plt.clf()
plt.xlim(4, 16)
plt.ylim(-2, 12.5)
# Highlight five named clusters; everything else is lumped into 'other'.
draw_clusters = [11,47,22, 29,39]
cl_sizes = {cl: 20 for cl in cluster_names}
for cl in draw_clusters:
    cl_sizes[cl] = 200
sns.scatterplot(x=[x for x, y in tqdm(embedding)],
                y=[y for x, y in tqdm(embedding)],
                hue=[cluster_names.get(cl) if cl in draw_clusters else 'other'
                     for _, cl in tqdm(sents_with_clusters)],
                palette=sns.color_palette(n_colors=6))
# sns.scatterplot(x=[x for x, y in tqdm(embedding)],
#                 y=[y for x, y in tqdm(embedding)],
#                 hue=[cluster_names.get(cl) if cl in draw_clusters else 'other'
#                      for _, cl in tqdm(sents_with_clusters)],
#                 size=[cl for _, cl in tqdm(sents_with_clusters)],
#                 sizes=cl_sizes,
#                 legend=False)
plt.grid()
# IPython help magic — notebook only.
?sns.scatterplot
# language model
# cluster_vecs — the sequences of cluster ids for each dialog (translated comment)
from tensorflow.keras import backend as K
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Embedding, LSTM, Dense, TimeDistributed, Activation, Dropout
from tensorflow.keras.models import clone_model
clusters_num = 50
hidden_size = 10  # NOTE(review): appears unused below — confirm
# Sequence model: input is a sequence of 512-dim cluster centers, output a
# per-step softmax over the 50 phrase clusters (predict the next cluster).
model = Sequential()
model.add(LSTM(300, return_sequences=True, input_shape=(None, 512)))
model.add(Dropout(0.25))
model.add(TimeDistributed(Dense(128, activation='tanh')))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.25))
model.add(Dense(32, activation='tanh'))
model.add(Dense(clusters_num, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['categorical_accuracy'])
model.summary()
from keras.utils import plot_model
plot_model(model, to_file='model.png', show_shapes=True,show_layer_names=True)
# IPython shell magic — notebook only.
!pip install pydot
# Inputs: all-but-last cluster centers of each dialog; targets (commented out)
# would be the one-step-shifted clusters, one-hot encoded.
X_lstm = [[clust_model_kmeans.cluster_centers_[cl] for cl in vec[:-1]]
          for vec in cluster_vecs if len(vec) > 2]
# y_lstm = [to_categorical(vec[1:], num_classes=50) for vec in cluster_vecs if len(vec) > 2]
# NOTE(review): y_lstm is undefined here because its construction is commented
# out above — this line only works with stale notebook state.
X_lstm_train, X_lstm_test, y_lstm_train, y_lstm_test = train_test_split(X_lstm, y_lstm,
                                                                        test_size=0.2, random_state=42)
print(len(X_lstm_train))
def same_size_generator(X, y, batch_size=20):
    """Endlessly yield (X, y) mini-batches whose sequences share one length.

    Keras LSTM batches must be rectangular, so samples are bucketed by
    ``len(x)`` and each bucket is emitted in chunks of at most ``batch_size``.
    The data and the order of the length buckets are reshuffled on every pass.
    """
    while True:
        X, y = shuffle(X, y)
        X, y = np.array(X), np.array(y)
        lengths = np.array([len(sample) for sample in X])
        # Visit the distinct sequence lengths in random order.
        for length in shuffle(list(set(lengths))):
            selector = np.where(lengths == length)
            batch_X = np.array([list(sample) for sample in X[selector]])
            batch_y = np.array([list(sample) for sample in y[selector]])
            for start in range(0, len(batch_X), batch_size):
                yield batch_X[start: start + batch_size], batch_y[start: start + batch_size]
# Train the language model with the same-length batch generator.
model.fit_generator(same_size_generator(X_lstm_train, y_lstm_train, batch_size=100),
                    epochs=50,
                    steps_per_epoch=200,
                    validation_data=same_size_generator(X_lstm_test, y_lstm_test, batch_size=100),
                    validation_steps=40)
# Drop the final softmax to expose the penultimate layer as an embedder.
# NOTE(review): clone_model copies the architecture but NOT the trained
# weights — unless weights are transferred elsewhere, cluster_seq_emb predicts
# with fresh random weights. Confirm.
cluster_seq_emb = clone_model(model)
cluster_seq_emb.pop()
cluster_seq_emb.summary()
embs['1']  # peek at the node2vec vector of cluster '1'
# Embed every cluster trigram: run its three cluster centers through the
# truncated model and take the last timestep's activation.
ngrams_list = []
ngrams_vecs = []
for ngram in tqdm(ngrams_sents):
    if ngram != '':
        ngrams_list.append(ngram)
        ngrams_vecs.append([vec[-1] for vec in
                            cluster_seq_emb.predict([[[clust_model_kmeans.cluster_centers_[int(cl)] for cl
                                                       in ngram.split(',')]]])][-1])
# UMAP projection of the trigram embeddings.
reducer = umap.UMAP()
ngrams_embedding = reducer.fit_transform(ngrams_vecs)
ngrams_embedding.shape
# Assign each trigram to one of 50 pre-fitted "huge" (topic) clusters.
# ngram_kmeans = KMeans(n_clusters=50, random_state=42)
ngram_clusters = ngram_kmeans.predict(ngrams_vecs)
fig, ax = plt.subplots()
fig.set_size_inches(30, 20)
plt.clf()
# plt.xlim(4, 17)
# plt.ylim(0, 20)
sns.scatterplot(x=[x for x, y in tqdm(ngrams_embedding)],
                y=[y for x, y in tqdm(ngrams_embedding)],
                hue=[ngram_clusters_names.get(cl) for cl in ngram_clusters],
                legend=False)
plt.grid()
ngram_to_clusters = {ngram: cluster for ngram, cluster in zip(ngrams_list, ngram_clusters)}
ngram_clusters_names
# Size of each topic cluster, measured in underlying sentence triples.
huge_clusters_len = {i: 0 for i in range(50)}
for ngram, cl in list(zip(ngrams_list, ngram_clusters)):
    huge_clusters_len[cl] += len(ngrams_sents[ngram])
{ngram_clusters_names[k] + ' {}'.format(k): v for k, v in sorted(huge_clusters_len.items(), key=lambda item: -item[1])}
# Interactive inspection of topic clusters (here only cluster 24): print its
# most frequent trigrams by cluster-name, plus sample sentence triples.
# NOTE(review): indentation reconstructed — the outer loop over cluster_names
# wrapping a fixed inner [24] looks like a leftover; confirm against the
# original notebook.
for cluster in cluster_names:
    # ngram_clusters_names = {}
    ngram_with_clusters = list(zip(ngrams_list, ngram_clusters))
    for curr_cluster in [24]:
        # if curr_cluster not in ngram_clusters_names:
        print(ngram_clusters_names.get(curr_cluster, curr_cluster))
        ngram_in_this_cluster = [ngram for ngram, cluster in ngram_with_clusters if cluster==curr_cluster]
        counter = 0
        for ngram in sorted(ngram_in_this_cluster, key=lambda x: -len(ngrams_sents[x]))[:20]:
            ngram_to_cluster_names = '"\\,,\\ "'.join([cluster_names[int(cl)] for cl in ngram.split(',')])
            print('"{}" len:{}'.format(ngram_to_cluster_names, len(ngrams_sents[ngram])))
            # Print example sentence triples for the first few trigrams only.
            if counter <= 4:
                for sents_set in ngrams_sents[ngram][:3]:
                    print('\n'.join(sents_set))
                    print('\n')
                print('\n\n')
            counter += 1
        # ngram_clusters_names[curr_cluster] = input()
        print('=========================\n=========================\n=========================\n\n\n')
ngram_clusters_names
# Build a 5x50 "picture" per play: rows are positions in the play (resized to
# 5), columns are topic-cluster frequencies. Pictures are summed per author,
# per 50-year period, and into a '0total' aggregate.
authors_drama_pics = {}
authors_drama_names = {}
for i, drama in tqdm(enumerate(dramas)):
    drama_pic = []
    for part_num, part in enumerate(dramas_texts[i]):
        # Best-effort per scene: skip anything that fails (short scenes, etc.).
        try:
            text_with_clusters, clusters = cluster_text(part, clust_model_kmeans, sent_to_vector_dict=sent_to_vector_dict)
            cluster_ngrams_for_this_text = [','.join([str(x) for x in ngram])
                                            for ngram in ngrams_from_vec(clusters, n=3)]
            ngram_clusters = [ngram_to_clusters[ngram] for ngram in cluster_ngrams_for_this_text if ngram != '']
            if len(ngram_clusters) > 0:
                # One L2-normalized 50-bin histogram of topic clusters per scene.
                part_vec = list(np.sum([to_categorical(cl, 50) for cl in ngram_clusters], axis=0))
                drama_pic.append(list(normalize([part_vec])[0]))
        except:
            pass
    if len(drama_pic) > 2:
        # sns.heatmap(drama_pic)
        # plt.show()
        # plt.clf()
        # Squash the variable number of scenes to a fixed 5-row image.
        drama_pic = resize(np.array(drama_pic), (5, 50))
        if authors_drama_pics.get('0total') is None:
            authors_drama_pics['0total'] = drama_pic
            authors_drama_names['0total'] = [drama]
        else:
            authors_drama_pics['0total'] = authors_drama_pics['0total'] + drama_pic
            authors_drama_names['0total'] = authors_drama_names['0total'] + [drama]
        # Aggregate by 50-year bucket.
        year = getRoundedThresholdv1(years[i], 50)
        if authors_drama_pics.get(str(year)) is None:
            authors_drama_pics[str(year)] = drama_pic
            authors_drama_names[str(year)] = [drama]
        else:
            authors_drama_pics[str(year)] = authors_drama_pics[str(year)] + drama_pic
            authors_drama_names[str(year)] = authors_drama_names[str(year)] + [drama]
        # Aggregate by author (filename prefix before the first '-').
        if authors_drama_pics.get(drama.split('-')[0]) is None:
            authors_drama_pics[drama.split('-')[0]] = drama_pic
            authors_drama_names[drama.split('-')[0]] = [drama]
        else:
            authors_drama_pics[drama.split('-')[0]] = authors_drama_pics[drama.split('-')[0]] + drama_pic
            authors_drama_names[drama.split('-')[0]] = authors_drama_names[drama.split('-')[0]] + [drama]
# Same aggregation as above but over raw phrase clusters ("fc" = phrase-level),
# not topic-cluster trigrams. Reuses/overwrites authors_drama_names.
authors_drama_pics_fc = {}
for i, drama in tqdm(enumerate(dramas)):
    drama_pic = []
    for part_num, part in enumerate(dramas_texts[i]):
        try:
            text_with_clusters, clusters = cluster_text(part, clust_model_kmeans, sent_to_vector_dict=sent_to_vector_dict)
            if len(clusters) > 0:
                # One L2-normalized 50-bin histogram of phrase clusters per scene.
                part_vec = list(np.sum([to_categorical(cl, 50) for cl in clusters], axis=0))
                drama_pic.append(list(normalize([part_vec])[0]))
        except:
            pass
    if len(drama_pic) > 2:
        # sns.heatmap(drama_pic)
        # plt.show()
        # plt.clf()
        drama_pic = resize(np.array(drama_pic), (5, 50))
        if authors_drama_pics_fc.get('0total') is None:
            authors_drama_pics_fc['0total'] = drama_pic
            authors_drama_names['0total'] = [drama]
        else:
            authors_drama_pics_fc['0total'] = authors_drama_pics_fc['0total'] + drama_pic
            authors_drama_names['0total'] = authors_drama_names['0total'] + [drama]
        year = getRoundedThresholdv1(years[i], 50)
        if authors_drama_pics_fc.get(str(year)) is None:
            authors_drama_pics_fc[str(year)] = drama_pic
            authors_drama_names[str(year)] = [drama]
        else:
            authors_drama_pics_fc[str(year)] = authors_drama_pics_fc[str(year)] + drama_pic
            authors_drama_names[str(year)] = authors_drama_names[str(year)] + [drama]
        if authors_drama_pics_fc.get(drama.split('-')[0]) is None:
            authors_drama_pics_fc[drama.split('-')[0]] = drama_pic
            authors_drama_names[drama.split('-')[0]] = [drama]
        else:
            authors_drama_pics_fc[drama.split('-')[0]] = authors_drama_pics_fc[drama.split('-')[0]] + drama_pic
            authors_drama_names[drama.split('-')[0]] = authors_drama_names[drama.split('-')[0]] + [drama]
getRoundedThresholdv1(1719, 50)  # sanity check of the year bucketing
# Per aggregate (author / period / total): print the dominant topic per row
# and show the 5x50 picture as a heatmap.
for author in sorted(authors_drama_pics):
    print(author)
    print(authors_drama_names[author][:10])
    for theme in [ngram_clusters_names[np.argmax(line)] for line in authors_drama_pics[author]]:
        print(theme)
    sns.heatmap(authors_drama_pics[author])
    plt.show()
    plt.clf()
# Manual rename of phrase cluster 45 ("Short affirmative answer/question").
cluster_names[45] = 'Короткий утвердительный ответ/вопрос'
len(sorted(set([author.split('-')[0] for author in authors_drama_names['1950']])))
np.max(drama_pic)
sns.set(font_scale=1.5)
# Phrase-cluster version: two side-by-side heatmaps (clusters 0-24 and 25-49)
# with Russian x labels "Beginning / Middle / End" of a play.
for author in sorted(authors_drama_pics_fc):
    print(author)
    print(authors_drama_names[author][:10])
    for theme in [cluster_names[np.argmax(line)] for line in authors_drama_pics_fc[author]]:
        print(theme)
    fig, axes = plt.subplots(1, 2)
    fig.set_size_inches(5, 8)
    # Cluster names ordered by cluster id, used as y tick labels.
    cl_names_list = [v for _, v in sorted(cluster_names.items(), key=lambda item: item[0])]
    # Shared color scale across both halves.
    max_value = np.max(normalize(authors_drama_pics_fc[author]))
    hm = sns.heatmap(np.transpose(normalize(authors_drama_pics_fc[author]))[:25],
                     yticklabels=cl_names_list[:25],
                     xticklabels=['Начало', '', 'Середина', '' , 'Конец'], ax=axes[0],
                     vmin=.0, vmax=max_value, cbar_kws = dict(use_gridspec=False,location="top"))
    hm.set_xticklabels(hm.get_xticklabels(), rotation=270, horizontalalignment='left')
    hm.set_yticklabels(hm.get_yticklabels(), rotation=0, horizontalalignment='right')
    hm = sns.heatmap(np.transpose(normalize(authors_drama_pics_fc[author]))[25:],
                     yticklabels=cl_names_list[25:],
                     xticklabels=['Начало', '', 'Середина', '' , 'Конец'], ax=axes[1],
                     vmin=.0, vmax=max_value, cbar_kws = dict(use_gridspec=False,location="top"))
    hm.set_xticklabels(hm.get_xticklabels(), rotation=270, horizontalalignment='center')
    hm.set_yticklabels(hm.get_yticklabels(), rotation=0, horizontalalignment='left')
    hm.yaxis.tick_right()
    plt.show()
    plt.clf()
# Group every sentence triple under its topic cluster.
sents_sets_for_clusters = {}
for ngram, sents_sets in tqdm(ngrams_sents.items()):
    if ngram != '':
        huge_cluster = ngram_to_clusters[ngram]
        if sents_sets_for_clusters.get(huge_cluster) is None:
            sents_sets_for_clusters[huge_cluster] = sents_sets
        else:
            sents_sets_for_clusters[huge_cluster] = sents_sets_for_clusters[huge_cluster] + sents_sets
# ngram_clusters_names[49] = 'Диалоги с территориальными вопросами (о движении/местонахождении)'
# Manual quality annotation: show random sentence triples from selected topic
# clusters; the annotator types 1 for "good" (anything else = "bad"), and a
# running good/bad/percent tally is kept per cluster.
clusters_quality = {}
for curr_cluster in [49, 28, 1]:
    cluster_sents = sents_sets_for_clusters[curr_cluster]
    curr_clust_q = clusters_quality.get(curr_cluster, {'good': 0, 'bad': 0, 'percent': 0})
    print (curr_clust_q)
    for i, sent in enumerate(random.choices(cluster_sents, k=100)):
        print(ngram_clusters_names[curr_cluster] + '_' + str(i))
        print('\n')
        print('\n'.join(sent))
        goodness = int(input())
        if goodness == 1:
            curr_clust_q['good'] += 1
        else:
            curr_clust_q['bad'] += 1
        curr_clust_q['percent'] = curr_clust_q['good'] / (curr_clust_q['good'] + curr_clust_q['bad'])
        print(curr_clust_q)
        print('=================================')
        clusters_quality[curr_cluster] = curr_clust_q
        # Stop once more than 100 judgments have been collected.
        if curr_clust_q['good'] + curr_clust_q['bad'] > 100:
            break
clusters_quality
ngram_clusters_names[49]
# Summary of the annotation results.
for cl in clusters_quality:
    print(ngram_clusters_names[cl])
    print(clusters_quality[cl])
    print('\n')
# {'blok-neznakomka_6_0_60': 117,
#  'blok-neznakomka_6_1_76': 118,
#  'blok-neznakomka_6_2_55': 119,
#  'blok-korol-na-ploschadi_30_0_92': 590,
#  'blok-korol-na-ploschadi_30_1_86': 591,
#  'blok-korol-na-ploschadi_30_2_95': 592,
#  'blok-balaganchik_168_0_77': 3730}
def final_cluster_text(per_text, cluster_model, cluster_seq_emb, ngram_kmeans, cluster_names={}, sent_to_vector_dict={}):
    """Full pipeline for one scene: phrase clusters, then trigram topic clusters.

    Parameters
    ----------
    per_text : iterable of (person, sentence) pairs for one scene.
    cluster_model : fitted phrase-level KMeans (needs ``predict`` and
        ``cluster_centers_``).
    cluster_seq_emb : truncated Keras sequence model used to embed cluster
        trigrams.
    ngram_kmeans : fitted KMeans over trigram embeddings.
    cluster_names : optional id -> human-readable name map; unknown ids fall
        back to ``str(id)``.
    sent_to_vector_dict : cache of precomputed sentence embeddings.

    Returns
    -------
    (output, all_clusters, huge_clusters_in_this_text)
        per-utterance (person, {sentence: cluster name}) pairs, the flat
        cluster-id sequence, and the topic-cluster id of each trigram window.

    NOTE(review): for scenes with fewer than 3 utterances ``ngrams_from_vec``
    yields its ``[[]]`` sentinel, which this function does not filter —
    confirm callers only pass sufficiently long scenes.
    """
    output = []
    all_clusters = []
    for person, sent in per_text:
        if sent in sent_to_vector_dict:
            sent_vec = sent_to_vector_dict[sent]
        else:
            sent_vec = get_sent_vector(sent)
        if sent_vec is not None:
            # BUG FIX: the original ignored ``cluster_model`` and used the
            # global ``clust_model_kmeans`` for both predict and
            # cluster_centers_. Every visible caller passes
            # clust_model_kmeans, so behavior is unchanged for them.
            cluster = cluster_model.predict([sent_vec])[0]
            output.append((person, {sent: cluster_names.get(cluster, str(cluster))}))
            all_clusters.append(cluster)
    # Slide a 3-wide window over the cluster sequence, embed each window with
    # the sequence model (last timestep), and assign it to a topic cluster.
    ngrams = ngrams_from_vec(all_clusters, n=3)
    ngram_vecs = [cluster_seq_emb.predict([[[cluster_model.cluster_centers_[cl] for cl in ngram]]])[0][-1] for ngram in ngrams]
    huge_clusters_in_this_text = ngram_kmeans.predict(ngram_vecs)
    return output, all_clusters, huge_clusters_in_this_text
# Demo of the full pipeline on one scene (play 96, scene 0): print each
# utterance with its phrase-cluster name (red) and the topic clusters of the
# trigram windows covering it (green).
parsed_text, _, huge_clusters = final_cluster_text(dramas_texts[96][0],
                                                   clust_model_kmeans,
                                                   cluster_seq_emb,
                                                   ngram_kmeans,
                                                   cluster_names,
                                                   sent_to_vector_dict=sent_to_vector_dict)
for ngram_name in [ngram_clusters_names[cl] for cl in huge_clusters]:
    print('{} -> '.format(ngram_name))
print('\n')
# i indexes the trigram windows; window w covers utterances w..w+2, so the
# windows covering utterance u are max(0, u-2)..u — hence the -2 start.
i = -2
for person, rep in parsed_text:
    print(person)
    for phrase in rep:
        huge_clusters_for_this_phrase = set([ngram_clusters_names[cl] for cl in huge_clusters[max(0, i): i+3]])
        print(phrase + ': ' + colored(rep[phrase], 'red') + '\t' + colored(huge_clusters_for_this_phrase, 'green'))
        i += 1
    print('\n')
len(parsed_text)
# Reuse the second return value bound to `_` above as the cluster sequence.
all_clusters = _
# Re-run the trigram embedding/classification by hand (default n=5 here).
ngrams = ngrams_from_vec(all_clusters)
ngram_vecs = [cluster_seq_emb.predict([[[clust_model_kmeans.cluster_centers_[cl] for cl in ngram]]])[0][-1] for ngram in ngrams]
huge_clusters_in_this_text = ngram_kmeans.predict(ngram_vecs)
huge_clusters_in_this_text
# Single hand-picked trigram embedding for inspection.
cluster_seq_emb.predict([[[clust_model_kmeans.cluster_centers_[cl] for cl in [31,41,24]]]])[0][-1]
cluster_names
# Persist all artifacts under the filenames that load_all_models() expects.
cluster_seq_emb.save('cluster_seq_emb.h5')
with open('kmeans_phrases.pkl', 'wb') as f:
    pickle.dump(clust_model_kmeans, f)
with open('kmeans_phrases_names.pkl', 'wb') as f:
    pickle.dump(cluster_names, f)
with open('kmeans_seqs.pkl', 'wb') as f:
    pickle.dump(ngram_kmeans, f)
with open('kmeans_seqs_names.pkl', 'wb') as f:
    pickle.dump(ngram_clusters_names, f)